"""Exploratory analysis of the Titanic dataset followed by a small decision tree.

The script loads the CSV, inspects and imputes missing values, looks at
outliers in ``Fare``, draws per-column distribution plots, and finally fits a
``DecisionTreeClassifier`` on ``Pclass``, ``Sex``, ``Age`` and ``Fare``.
"""

import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
import seaborn as sns  # was commented out although `sns` is used throughout
from sklearn import metrics
from sklearn import tree
from sklearn.impute import KNNImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# NOTE(review): the notebook ran the IPython/Colab magic
# `%tensorflow_version 1.x` here, which is a syntax error in a plain Python
# file, and then imported the misspelled module `tensflow`. Nothing visible in
# this script uses `tf`, so the import is kept but made best-effort.
try:
    import tensorflow as tf
except ImportError:
    tf = None


def _plot_grid(columns, draw, title):
    """Draw one subplot per column on a near-square grid and show the figure.

    Args:
        columns: Column names to plot, one per subplot.
        draw: Callable ``draw(column, axis)`` that renders a single subplot.
        title: Figure-level title.

    Returns:
        ``(fig, axes)`` as returned by ``plt.subplots`` (axes is always 2-D),
        or ``(None, None)`` when ``columns`` is empty.
    """
    columns = list(columns)
    if not columns:
        return None, None
    # Size the grid from the column count so every column gets a cell.
    n_cols = math.ceil(math.sqrt(len(columns)))
    n_rows = math.ceil(len(columns) / n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 10), squeeze=False)
    cells = list(axes.flat)
    for column, cell in zip(columns, cells):
        draw(column, cell)
    # Remove the cells left over when the grid is larger than the column list.
    for cell in cells[len(columns):]:
        fig.delaxes(cell)
    fig.suptitle(title, fontsize=16)
    plt.show()
    return fig, axes


# --- Load and inspect -------------------------------------------------------
data = pd.read_csv("D:\\titanic dataset.csv", sep=",", encoding="utf-8")

# Outside a notebook bare expressions are discarded, so print them explicitly.
print(data.head())
data.info()
print(data.describe())
print(data.describe(include="object").T)
print(data.isnull().sum())

plt.figure(figsize=(7, 4))
sns.heatmap(data.isnull())

print(data[data["Age"] <= 0].shape)
print(data["PassengerId"].duplicated().sum())

# --- Impute missing values --------------------------------------------------
numerical_data = [c for c in data.columns if data.dtypes[c] != 'object']
object_data = [c for c in data.columns if data.dtypes[c] == 'object']
print(numerical_data)

# Numeric columns: k-nearest-neighbours imputation.
imputer = KNNImputer(n_neighbors=5)
data[numerical_data] = imputer.fit_transform(data[numerical_data])
print(data.isnull().sum())

# Remaining columns: fill each gap with a random draw from the observed values
# of the same column. Results are not reproducible unless NumPy is seeded.
for column in data.columns:
    missing = data[column].isnull()
    available_values = data[column].dropna().to_numpy()
    if not missing.any() or available_values.size == 0:
        # Nothing to fill, or nothing to draw from (np.random.choice would raise).
        continue
    data.loc[missing, column] = np.random.choice(
        available_values, size=int(missing.sum())
    )
print(data.isnull().sum())

data['Fare'] = data['Fare'].round(2)

plt.figure(figsize=(7, 4))
sns.heatmap(data.isnull())

# --- Fare outliers (3-sigma rule) -------------------------------------------
px.box(data_frame=data, y="Fare").show()
plt.figure(figsize=(7, 5))
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(data=data, x="Fare", kde=True, stat="density")
plt.show()

# find the limits
fare_mean = data["Fare"].mean()
fare_std = data["Fare"].std()
upper_limit = fare_mean + 3 * fare_std
lower_limit = fare_mean - 3 * fare_std
print("upper limit: ", upper_limit)
print("lower limit: ", lower_limit)

# find the outliers
is_outlier = (data["Fare"] > upper_limit) | (data["Fare"] < lower_limit)
outliers_df = data.loc[is_outlier]
print(outliers_df.shape)

# remove outliers from the data; using the negated mask keeps the two sets
# complementary (a value exactly on a limit is not an outlier).
new_df = data.loc[~is_outlier]
print("before removing the outliers: ", len(data))
print("after removing the outliers: ", len(new_df))
print("the outliers: ", len(data) - len(new_df))

px.box(data_frame=new_df, y="Fare").show()
plt.figure(figsize=(7, 5))
sns.histplot(data=new_df, x="Fare", kde=True, stat="density")
plt.show()

# NOTE(review): not used anywhere in the visible script; presumably the
# intended target for saving `new_df` - confirm before wiring it up.
file_path = "new df.csv"

# --- Per-column distributions -----------------------------------------------
num_columns = list(
    data.drop(['Pclass', 'Survived'], axis=1)
    .select_dtypes(include=['float', 'int'])
    .columns
)
cat_columns = list(data.select_dtypes(include=['object']).columns)

# The original loops referenced an undefined `df` inside a bare `except:`, so
# every axis was silently deleted; they now plot from `data`.
fig, ax = _plot_grid(
    num_columns,
    lambda column, cell: sns.histplot(
        data=data, x=column, kde=True, bins=20, ax=cell
    ),
    'Histograms of numerical columns',
)

fig, ax = _plot_grid(
    cat_columns,
    lambda column, cell: sns.countplot(
        data=data, x=column, ax=cell, hue='Survived'
    ),
    'Count plots of categorical columns by survival',
)

fig, ax = _plot_grid(
    num_columns,
    lambda column, cell: sns.boxplot(data=data, x=column, ax=cell),
    'Box plots of numerical columns',
)

# --- Survival by age, sex and fare ------------------------------------------
target = data.Survived

age_groups = pd.cut(data['Age'], bins=range(0, 100, 10), include_lowest=True)
pd.crosstab(age_groups, data.Survived).plot(kind='bar')
plt.ylabel('No.of Passengers')
plt.title('Age over Survival')
plt.grid(color="red", linestyle=":", alpha=0.5)

pd.crosstab(data.Sex, data.Survived).plot(kind='bar')
plt.ylabel('No.of Passengers')
plt.title('Sex over Survival')
plt.grid(color="red", linestyle=":", alpha=0.5)

# Bin the Fare column (the original binned Age under the "Fare" title).
fare_groups = pd.cut(data['Fare'], bins=range(0, 550, 10), include_lowest=True)
pd.crosstab(fare_groups, data.Survived).plot(kind='bar')
plt.ylabel('No.of Passengers')
plt.title('Fare over Survival')
plt.grid(color="red", linestyle=":", alpha=0.5)

# --- Decision tree ----------------------------------------------------------
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3)

# Work on a copy so encoding `Sex` does not write into a slice of `data`.
inputs = data[['Pclass', 'Sex', 'Age', 'Fare']].copy()
print(inputs)

obj = LabelEncoder()
inputs['Sex'] = obj.fit_transform(inputs['Sex'])
print(inputs.head(4))

x_train, x_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.3
)
model.fit(x_train, y_train)
print(model.score(x_test, y_test))

# Predict with a DataFrame carrying the training column names so scikit-learn
# can match features by name.
sample = pd.DataFrame([[2, 1, 22, 100]], columns=inputs.columns)
print(model.predict(sample))
print(model.predict_proba(sample))